library(tidyverse)
library(tidymodels)
library(modeltime)
library(sf)
library(leaflet)
NYPD_SHOOTING_URL <- "https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv?accessType=DOWNLOAD"
Below, we import the shooting dataset.
shooting_raw_df <-
NYPD_SHOOTING_URL %>%
read_csv(
col_types = cols(
INCIDENT_KEY = col_integer(),
OCCUR_DATE = col_date(format = "%m/%d/%Y"),
OCCUR_TIME = col_time(format = "%H:%M:%S"),
PRECINCT = col_integer(),
JURISDICTION_CODE = col_integer(),
STATISTICAL_MURDER_FLAG = col_logical(),
X_COORD_CD = col_double(),
Y_COORD_CD = col_double(),
Latitude = col_double(),
Longitude = col_double(),
.default = col_character()
)
) %>%
janitor::clean_names()
We start with a summary of the dataset:
shooting_raw_df %>%
summary()
## incident_key occur_date occur_time boro
## Min. : 9953245 Min. :2006-01-01 Length:28562 Length:28562
## 1st Qu.: 65439914 1st Qu.:2009-09-04 Class1:hms Class :character
## Median : 92711254 Median :2013-09-20 Class2:difftime Mode :character
## Mean :127405824 Mean :2014-06-07 Mode :numeric
## 3rd Qu.:203131993 3rd Qu.:2019-09-29
## Max. :279758069 Max. :2023-12-29
##
## loc_of_occur_desc precinct jurisdiction_code loc_classfctn_desc
## Length:28562 Min. : 1.0 Min. :0.0000 Length:28562
## Class :character 1st Qu.: 44.0 1st Qu.:0.0000 Class :character
## Mode :character Median : 67.0 Median :0.0000 Mode :character
## Mean : 65.5 Mean :0.3219
## 3rd Qu.: 81.0 3rd Qu.:0.0000
## Max. :123.0 Max. :2.0000
## NA's :2
## location_desc statistical_murder_flag perp_age_group
## Length:28562 Mode :logical Length:28562
## Class :character FALSE:23036 Class :character
## Mode :character TRUE :5526 Mode :character
##
##
##
##
## perp_sex perp_race vic_age_group vic_sex
## Length:28562 Length:28562 Length:28562 Length:28562
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## vic_race x_coord_cd y_coord_cd latitude
## Length:28562 Min. : 914928 Min. :125757 Min. :40.51
## Class :character 1st Qu.:1000068 1st Qu.:182912 1st Qu.:40.67
## Mode :character Median :1007772 Median :194901 Median :40.70
## Mean :1009424 Mean :208380 Mean :40.74
## 3rd Qu.:1016807 3rd Qu.:239814 3rd Qu.:40.82
## Max. :1066815 Max. :271128 Max. :40.91
## NA's :59
## longitude lon_lat
## Min. :-74.25 Length:28562
## 1st Qu.:-73.94 Class :character
## Median :-73.92 Mode :character
## Mean :-73.91
## 3rd Qu.:-73.88
## Max. :-73.70
## NA's :59
shooting_raw_df %>%
glimpse()
## Rows: 28,562
## Columns: 21
## $ incident_key <int> 244608249, 247542571, 84967535, 202853370, 270…
## $ occur_date <date> 2022-05-05, 2022-07-04, 2012-05-27, 2019-09-2…
## $ occur_time <time> 00:10:00, 22:20:00, 19:35:00, 21:00:00, 21:00…
## $ boro <chr> "MANHATTAN", "BRONX", "QUEENS", "BRONX", "BROO…
## $ loc_of_occur_desc <chr> "INSIDE", "OUTSIDE", NA, NA, NA, NA, NA, NA, N…
## $ precinct <int> 14, 48, 103, 42, 83, 23, 113, 77, 48, 49, 73, …
## $ jurisdiction_code <int> 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ loc_classfctn_desc <chr> "COMMERCIAL", "STREET", NA, NA, NA, NA, NA, NA…
## $ location_desc <chr> "VIDEO STORE", "(null)", NA, NA, NA, "MULTI DW…
## $ statistical_murder_flag <lgl> TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, …
## $ perp_age_group <chr> "25-44", "(null)", NA, "25-44", "25-44", NA, N…
## $ perp_sex <chr> "M", "(null)", NA, "M", "M", NA, NA, NA, NA, "…
## $ perp_race <chr> "BLACK", "(null)", NA, "UNKNOWN", "BLACK", NA,…
## $ vic_age_group <chr> "25-44", "18-24", "18-24", "25-44", "25-44", "…
## $ vic_sex <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "…
## $ vic_race <chr> "BLACK", "BLACK", "BLACK", "BLACK", "BLACK", "…
## $ x_coord_cd <dbl> 986050, 1016802, 1048632, 1014493, 1009149, 99…
## $ y_coord_cd <dbl> 214231.0, 250581.0, 198262.0, 242565.0, 190104…
## $ latitude <dbl> 40.75469, 40.85440, 40.71063, 40.83242, 40.688…
## $ longitude <dbl> -73.99350, -73.88233, -73.76777, -73.89071, -7…
## $ lon_lat <chr> "POINT (-73.9935 40.754692)", "POINT (-73.8823…
A few items we will need to address in cleaning:
occur_date and
occur_time with a single occur_datetime value
for ease of analysisboro*_age_group*_sex*_race*_age_group has both
"UNKNOWN" and NA_character_ values present. We
should collapse these. See plots below.*_age_group also containsshooting_raw_df %>%
select(boro, ends_with("_age_group"), ends_with("_sex"), ends_with("_race")) %>%
pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
ggplot(aes(x = value)) +
geom_bar() +
facet_wrap(facets = vars(variable), nrow = 2, scales = "free_x") +
xlab(NULL) +
ylab(NULL) +
theme(axis.text.x.bottom = element_text(angle = 45, hjust = 1, size = 6))
shooting_clean_df <-
shooting_raw_df %>%
mutate(
occur_datetime = lubridate::make_datetime(
year = year(occur_date),
month = month(occur_date),
day = day(occur_date),
hour = hour(occur_time),
min = minute(occur_time),
sec = second(occur_time),
tz = "America/New_York"
),
boro =
boro %>%
as_factor(),
across(
ends_with("_age_group"),
~ coalesce(., "UNKNOWN") %>%
as_factor() %>%
fct_collapse(UNKNOWN = c("UNKNOWN", "(null)")) %>%
fct_relevel("<18", "18-24", "25-44", "45-64", "65+")
),
across(
c(ends_with("_sex"), ends_with("_race")),
~ coalesce(., "U") %>%
as_factor() %>%
fct_collapse(U = c("U", "(null)"))
)
)
Crime, especially in large cities like New York, has been a hot topic in American political discourse lately. In New York’s case, it’s common to see news coverage (particularly from media outlets broadly considered conservative) citing a widespread malaise among New York residents about the levels of violent crime in the city.
Meanwhile, other coverage (usually from media outlets broadly considered liberal) cite declining levels of violent crime since the pandemic.
With all of this in mind, in order to better understand trends in violent crime NYC, particularly during and after the 2020 COVID pandemic, I aim to analyze the rates of shootings in the provided dataset over time, with particular attention paid to the time period from 2017 to the present. Additionally, I aim to identify any differences in crime rates by geography and look for correlations with voting patterns from the 2020 presidential election, which may give a more detailed picture of how violent crime is evolving and suggest how different communities may be perceiving this crime.
Below, we show a time series of the number of shootings per month.
The data appear highly seasonal within the year, so we also show the trend component of an STL decomposition as the “seasonally adjusted” number of shootings.
shootings_monthly_df <-
shooting_clean_df %>%
mutate(occur_month = lubridate::floor_date(occur_datetime, unit = "month")) %>%
group_by(occur_month) %>%
summarize(num_incidents = n())
stl_model_spec <-
seasonal_reg(mode = "regression", seasonal_period_1 = "1 year") %>%
set_engine("stlm_arima")
stl_model_fit <-
stl_model_spec %>%
fit(num_incidents ~ occur_month, data = shootings_monthly_df)
shootings_monthly_df %>%
mutate(trend = stl_model_fit$fit$models$model_1$stl[,"Trend"] %>% as.numeric()) %>%
pivot_longer(cols = c(num_incidents, trend), names_to = "variable", values_to = "value") %>%
ggplot(aes(x = occur_month)) +
geom_line(aes(y = value, color = variable)) +
geom_hline(yintercept = 0, color = "white", size = 1.5) +
scale_x_datetime(breaks = breaks_width(width = "2 year"), labels = label_date(format = "%Y")) +
scale_color_manual(
name = NULL,
values = c("num_incidents" = "black", "trend" = "red"),
labels = c("num_incidents" = "Observed", "trend" = "Seasonally Adj."),
) +
labs(
title = "Monthly Shooting Incidents in NYC",
subtitle = "Shootings near 15-year low following a sharp rise during COVID"
) +
xlab(NULL) +
ylab("No. of Incidents") +
theme(legend.position = "bottom")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
seasonal_df <-
shootings_monthly_df %>%
mutate(
seasonal = stl_model_fit$fit$models$model_1$stl[,"Seasonal12"] %>% as.numeric(),
month = month(occur_month) %>% as.integer(),
month_label = month(occur_month, label = TRUE),
occur_year = year(occur_month)
)
seasonal_df %>%
ggplot(aes(x = occur_year, y = seasonal)) +
geom_hline(yintercept = 0, color = "white", size = 1.5) +
geom_line(aes(color = month, group = month)) +
geom_point(aes(color = month)) +
ggrepel::geom_text_repel(
aes(x = occur_year, y = seasonal, label = month_label, color = month),
data = filter(seasonal_df, occur_year == max(occur_year)),
nudge_x = 1,
segment.color = NA
) +
scale_x_continuous(breaks = breaks_width(2)) +
scale_y_continuous(breaks = breaks_width(20)) +
scale_color_viridis_c() +
xlab("Year") +
ylab("Seasonal Component") +
labs(
title = "Seasonal Component, Monthly Shootings",
subtitle = "Shootings peak in the summer months",
) +
theme(legend.position = "none")
What’s clear from these visualizations is that shooting incidents came down steadily from 2006 to 2020, spiked during the COVID pandemic, and have fallen more or less to pre-pandemic levels since.
The Upshot at the New York Times has compiled extremely detailed election results for the 2020 presidential election, which includes precinct-level data for the five boroughs of New York City. Using this data, we can correlate voting patterns and shooting incidents within smaller geographies in the city. See the GitHub link above for more information. For the purposes of this analysis, the relevant fields in the data are
votes_dem: the number of votes for the Democratic
candidate (Joe Biden)votes_rep: the number of votes for the Republican
candidate (Donald Trump)votes_total: the total number of votes cast for
president in the precinctpct_dem_lead: the margin by which the Democratic
candidate led the Republican candidate in that precinct (calculated as
(votes_dem - votes_rep) / votes_total)nyc_borough_fips_codes <- c(
"36061", # Manhattan
"36005", # The Bronx
"36047", # Brooklyn
"36081", # Queens
"36085" # Staten Island
)
nyc_election_results <-
## Downloaded from
## https://int.nyt.com/newsgraphics/elections/map-data/2020/national/precincts-with-results.geojson.gz
here::here("data/election-results.geojson") %>%
geojsonsf::geojson_sf() %>%
janitor::clean_names() %>%
mutate(fips_code = str_split_i(geoid, "-", 1)) %>%
filter(fips_code %in% nyc_borough_fips_codes)
Below, we plot the difference in incidents per year for the periods 2020-2021 (the pandemic) and 2016-2019 (pre-pandemic) by voting precinct. There appears to be very little correlation between presidential election results at the precinct level and changes in the rate of shooting incidents.
A simple linear model of this difference as a function of
pct_dem_lead shows a statistically significant but
infinitesimal positive correlation.
sf_use_s2(FALSE)
shooting_sf <-
shooting_clean_df %>%
mutate(location_geo = purrr::map2(longitude, latitude, ~ st_point(x = c(.x, .y)))) %>%
st_as_sf() %>%
st_set_crs(4326)
shooting_precinct <-
nyc_election_results %>%
st_join(
shooting_sf,
join = st_intersects
) %>%
as_tibble() %>%
select(incident_key, geoid, occur_datetime)
shootings_2016_2019_by_precinct <-
shooting_precinct %>%
filter(year(occur_datetime) %in% 2016:2019) %>%
group_by(geoid) %>%
summarize(n_incidents_2016_2019 = sum(!is.na(incident_key)))
shootings_2020_2021_by_precinct <-
shooting_precinct %>%
filter(year(occur_datetime) %in% 2020:2021) %>%
group_by(geoid) %>%
summarize(n_incidents_2020_2021 = sum(!is.na(incident_key)))
shootings_diff_by_precinct <-
nyc_election_results %>%
as_tibble() %>%
left_join(shootings_2016_2019_by_precinct, by = "geoid") %>%
left_join(shootings_2020_2021_by_precinct, by = "geoid") %>%
mutate(
across(starts_with("n_incidents"), ~ coalesce(., 0)),
diff = n_incidents_2020_2021 / 2 - n_incidents_2016_2019 / 4
)
linear_model <-
linear_reg() %>%
fit(diff ~ pct_dem_lead, data = shootings_diff_by_precinct)
linear_model %>%
broom::tidy()
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.0425 0.0157 2.70 6.91e- 3
## 2 pct_dem_lead 0.00230 0.000232 9.94 4.28e-23
shootings_diff_by_precinct %>%
mutate(model_pred = predict(linear_model, new_data = shootings_diff_by_precinct)$.pred) %>%
ggplot(aes(pct_dem_lead)) +
geom_hex(aes(y = diff)) +
geom_hline(yintercept = 0, size = 1, color = "white") +
geom_vline(xintercept = 0, size = 1, color = "white") +
geom_line(aes(y = model_pred), color = "red", size = 0.5) +
scale_fill_viridis_c(name = "# precincts") +
labs(
title = "Change in Shootings/Yr from 2016-19 to 2020-21 vs. Dem. Margin by Precinct",
subtitle = "Virtually no correlation exists",
x = "Dem. Margin",
y = "Shootings/Yr"
) +
theme(
legend.position = "bottom",
legend.key.height = unit(2, "mm"),
legend.key.width = unit(2, "cm"),
legend.title.position = "bottom"
)
If we look at a map of the data, it’s clear that changes in rates of shootings are highly localized. There are only a handful of precincts where shooting rates shot way up during the pandemic, and even a few precincts where shooting rates went way down, with most precincts staying at roughly the same rates.
make_shooting_label <- function(diff) {
str_glue("<br>Change in Shootings per Year: {round(diff, 2)}</br>")
}
precinct_shooting_map_data <-
shootings_diff_by_precinct %>%
left_join(nyc_election_results %>% select(geoid, geometry), by = "geoid") %>%
mutate(diff = coalesce(diff, 0))
domain <- c(min(precinct_shooting_map_data$diff), max(precinct_shooting_map_data$diff))
lower_palette <- colorRampPalette(c("blue", "#eeeeff"), space = "Lab")(abs(domain[1]))
upper_palette <- colorRampPalette(c("#ffefef", "orange"), space = "Lab")(domain[2])
shooting_palette <- c(lower_palette, upper_palette)
precinct_shooting_map_data %>%
st_as_sf() %>%
leaflet(options = leafletOptions(minZoom = 10)) %>%
addTiles() %>%
addPolygons(
weight = 1,
color = ~get('colorBin')(shooting_palette, domain)(diff),
fillOpacity = 0.5,
label = ~map(make_shooting_label(diff), htmltools::HTML)
) %>%
addLegend(position = "bottomright", pal = colorBin(shooting_palette, domain), values = domain)
If we examine on the neighborhood level, we can see a much stronger correlation. Neighborhood names and boundaries used here are the ones provided by NYC OpenData. Areas in northern Manhattan, the Bronx, and Eastern Brooklyn seemed to experience the largest increases in shootings during the pandemic, whereas many areas of Staten Island and Queens saw decreases in shootings.
neighborhoods <-
# Downloaded from https://data.cityofnewyork.us/City-Government/Neighborhood-Names-GIS/99bc-9p23
here::here("data/neighborhood-boundaries.geojson") %>%
geojsonsf::geojson_sf() %>%
janitor::clean_names()
shooting_diff_by_neighborhood <-
neighborhoods %>%
st_join(shootings_diff_by_precinct %>% st_as_sf(), join = st_intersects) %>%
as_tibble() %>%
select(-votes_per_sqkm) %>%
group_by(ntacode, ntaname, geometry) %>%
summarize(
across(c(starts_with("n_incidents"), starts_with("votes_")), sum)
) %>%
ungroup() %>%
mutate(
diff = n_incidents_2020_2021 / 2 - n_incidents_2016_2019 / 4,
pct_dem_lead = 100 * (votes_dem - votes_rep) / votes_total
)
neighborhood_linear_model <-
linear_reg() %>%
fit(diff ~ pct_dem_lead, data = shooting_diff_by_neighborhood)
neighborhood_linear_model %>%
broom::tidy()
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1.55 1.13 1.37 1.71e- 1
## 2 pct_dem_lead 0.142 0.0184 7.72 6.10e-13
shooting_diff_by_neighborhood %>%
mutate(model_pred = predict(neighborhood_linear_model, new_data = shooting_diff_by_neighborhood)$.pred) %>%
ggplot(aes(pct_dem_lead)) +
geom_hex(aes(y = diff)) +
geom_hline(yintercept = 0, size = 1, color = "white") +
geom_vline(xintercept = 0, size = 1, color = "white") +
geom_line(aes(y = model_pred), color = "red", size = 0.5) +
scale_fill_viridis_c(name = "# neighborhoods") +
labs(
title = "Change in Shootings/Yr from 2016-19 to 2020-21 vs. Dem. Margin by Neighborhood",
subtitle = "Increased Dem Support Correlated with Increased Shootings",
x = "Dem. Margin",
y = "Shootings/Yr"
) +
theme(
legend.position = "bottom",
legend.key.height = unit(2, "mm"),
legend.key.width = unit(2, "cm"),
legend.title.position = "bottom"
)
domain <- c(min(shooting_diff_by_neighborhood$diff), max(shooting_diff_by_neighborhood$diff))
lower_palette <- colorRampPalette(c("blue", "#eeeeff"), space = "Lab")(abs(domain[1]))
upper_palette <- colorRampPalette(c("#ffefef", "orange"), space = "Lab")(domain[2])
shooting_palette <- c(lower_palette, upper_palette)
make_neighborhood_label <- function(votes_dem, votes_rep, pct_dem_lead, name) {
margin <-
if_else(
pct_dem_lead >= 0,
str_glue("D+{round(pct_dem_lead, 1)}%"),
str_glue("R+{round(-pct_dem_lead, 1)}%")
)
style <- if_else(
pct_dem_lead >= 0,
"color:blue;",
"color:red;"
)
str_glue("<h3>{name}</h3><p>Votes for Biden: {votes_dem}</p><p>Votes for Trump: {votes_rep}</p><p><b>Margin: <span style={style}>{margin}<span></b></p>")
}
shooting_diff_by_neighborhood %>%
sf::st_as_sf() %>%
leaflet() %>%
addTiles() %>%
addPolygons(
weight = 1,
color = "black",
fillColor = ~get('colorBin')(shooting_palette, domain)(diff),
fillOpacity = 0.5,
label = ~map(make_neighborhood_label(votes_dem, votes_rep, pct_dem_lead, ntaname), htmltools::HTML)
)
It’s particularly interesting to note here how much the results of this analysis can change depending on the grain of the geography chosen, which emphasizes just how localized a phenomenon crime really is.
I think what we can say confidently is that discussing “levels of crime” in a geography as large as New York City is a fraught exercise. The rates of violence taking place have varied on a scale as small as a few blocks. For this reason, it’s difficult to say exactly how, if at all, shooting rates affected voting patterns in New York.
With crime data, there are plenty of potential sources of bias. These are a couple that come to mind, but domain experts will surely be able to identify more:
Additionally, my own personal biases certainly affect the analysis here. Crime is a heavily politicized topic of study, and my decision to analyze crime in New York through the lens of politics was definitely motivated by my personal views My perception of crime in New York is also colored by my personal experiences there - I lived in Hell’s Kitchen for a year and a half from 2021-2022. In order to mitigate these sources of bias, it was important to avoid jumping to conclusions and only draw inferences evidenced in the data.
Finally, analyzing only violent crime leaves some holes in this analysis. It’s not clear how non-violent crimes, like theft, have evolved over this time period and may also contribute to political perceptions of crime in New York.